################################################################################
# Replication of all analyses in
# ``Audits of the 2020 American Election Show an Accurate Vote Count''
# Baltz, Gonzalez, Guo, Jaffe, Stewart III, PNAS
#
# written by sbaltz at mit
#	begun march 2024 (this version)
#	last edited march 2025
#
# Please direct any questions to Samuel Baltz at sbaltz@umich.edu
################################################################################
library(colorspace) #it will be convenient to adjust_transparency


################################################################################
# Global variables and setup
################################################################################
DF_DIR <- '../data_for_analysis/'
setwd(DF_DIR)
FIG_DIR <- '../figures/'

sinkfile <- file(paste0(DF_DIR,'analysis_log.txt'))
sink(sinkfile, type='output')
sink(sinkfile, type='message')

au <- read.csv('all_audits.csv')
cand <- au[!is.na(au$audited),]
cand <- cand[cand$audited != 0,]
ball <- au[!is.na(au$ballots),]

totals <- read.csv('all_totals.csv')

TOTAL_BIDEN <- 81283501 #The official count of votes cast for Biden
TOTAL_TRUMP <- 74223975 #The official count of votes cast for Trump

PARTISAN_OFFICES <- c('US PRESIDENT', 'US SENATE', 'US HOUSE', 'GOVERNOR',
		      'STATE SENATE', 'STATE HOUSE')

NUM_PLOT_LOCS <- 10

#Custom colour definitions
DEM_COL_PRE <- rgb(57/255,127/255,255/255,0.85)
DEM_COL_MID <- rgb(57/255,127/255,255/255,1/3)
DEM_COL_POST <- rgb(57/255,127/255,255/255,0.5)
GOP_COL_PRE <- rgb(255/255,57/255,57/255,0.85)
GOP_COL_MID <- rgb(255/255,57/255,57/255,1/3)
GOP_COL_POST <- rgb(255/255,57/255,57/255,0.5)
LIB_COL_PRE <- rgb(242/255,255/255,108/255,0.85)
LIB_COL_POST <- rgb(242/255,255/255,108/255,0.5)
OTH_COL_PRE <- rgb(184/255,184/255,184/255,0.85)
OTH_COL_POST <- rgb(184/255,184/255,184/255,0.5)
NON_COL_PRE <- rgb(0/255,0/255,0/255,0.85)
NON_COL_POST <- rgb(0/255,0/255,0/255,0.5)

#Set a canonical order for parties and their colours
PTY_ORDER <- c('DEMOCRAT','REPUBLICAN','LIBERTARIAN','OTHER','NONE')
#For grouped barplots
COL_ORDER_GROUP <- c(DEM_COL_PRE,
		     DEM_COL_POST,
		     GOP_COL_PRE,
		     GOP_COL_POST,
		     LIB_COL_PRE,
		     LIB_COL_POST,
		     OTH_COL_PRE,
		     OTH_COL_POST,
		     NON_COL_PRE,
		     NON_COL_POST
		   )

#We suppress warnings so that the analysis log is clear of irrelevant warnings
# and easier to read. To verify that the warnings produced are not relevant to
# the results of the analysis, change this switch
options(warn=-1)


################################################################################
# Global functions
################################################################################
ClnNum <- function(theNum){
  theNum <- format(theNum, big.mark=",")
  return(theNum)
}

GenLabs <- function(locs){
  stInds <- substr(locs,1,2)
  indOne <- unname(sapply(stInds,FUN=substr,start=1,stop=1))
  indTwo <- unname(sapply(stInds,FUN=substr,start=2,stop=2))
  prefs <- paste0(indOne,'.',indTwo)
  locs <- round(locs,0)
  indLens <- nchar(locs)
  labs <- paste0(prefs,"%.%10^",indLens-1)
  labs <- parse(text=labs)
  return(labs)
}

MakePtysShort <- function(fullNames){
  fullNames[fullNames == 'DEMOCRAT'] <- 'DEM'
  fullNames[fullNames == 'REPUBLICAN'] <- 'GOP'
  fullNames[fullNames == 'LIBERTARIAN'] <- 'LIB'
  return(fullNames)
}


################################################################################
# Print the empirical results that are summarized in the paper
################################################################################
cat("NUMBER OF REGIONAL GOVERNMENTS:\n")
localities <- union(paste0(cand$state,cand$county),
		    paste0(ball$state,ball$county))
#We manually add 4 counties from Texas, since that state does not appear in the
# candidate- or ballot-level dataframes, as we manually obtained information
# about total ballots cast only
cat(length(localities)+4,"\n\n")

cat("STATES INCLUDED IN THE ANALYSIS (EXCEPT TEXAS):\n")
nStates <- length(unique(c(cand$state,ball$state)))
print(sort(unique(c(cand$state,ball$state,'TEXAS'))))
cat("\n")
cat("NUMBER OF STATES IN THE ANALYSIS:\n")
cat(paste0("\t",nStates+1,"\n\n"))

cat("NUMBER OF CANDIDATE-LEVEL VOTES AUDITED:\n")
audited_cand <- sum(cand$audited, na.rm=TRUE)
cat(paste0("\t",ClnNum(audited_cand),"\n\n"))

#To obtain the total number of ballots, we need to sum the number of ballots in
# the ballot-level dataset. This includes every audit where the measure
# provided to us was the number of discrepancies found in some total number of
# ballots. However, there is one audit that provided slightly less data: just
# the total number of ballots counted in the original vote count, and in the
# audit. That case was Texas. So we need to also add the total number of ballots
# audited in Texas from the total-level dataset. Adding any states other than
# Texas would double-count those states' ballots.
cat("NUMBER OF BALLOTS CHECKED AT THE BALLOT LEVEL, WITHOUT TEXAS:\n")
nBallots <- sum(ball$ballots, na.rm=TRUE)
cat(paste0("\t",ClnNum(nBallots),"\n\n"))
cat("NUMBER OF BALLOTS CHECKED AT THE BALLOT LEVEL, INCLUDING TEXAS:\n")
nBallots <- nBallots + sum(totals$audited[totals$state == 'TEXAS'])
cat(paste0("\t",ClnNum(nBallots),"\n\n"))

#Count the proportion of presidential/presidential nominees' votes audited
nPresAll <- sum(cand$audited[cand$office == 'US PRESIDENT'])

cat("PROPORTION OF ALL TRUMP VOTES AUDITED:\n")
nTrump <- sum(cand$audited[cand$candidate == 'DONALD J TRUMP'],na.rm=TRUE)
propTrump <- nTrump/TOTAL_TRUMP
cat(paste0("\t",ClnNum(propTrump),"\n\n"))

cat("PROPORTION OF ALL BIDEN VOTES AUDITED:\n") 
nBiden <- sum(cand$audited[cand$candidate == 'JOSEPH R BIDEN'],na.rm=TRUE) 
propBiden <- nBiden/TOTAL_BIDEN
cat(paste0("\t",ClnNum(propBiden),"\n\n")) 

cat(" --- \n")
cat("\tWe calculate the mean county-level presidential margin shift below\n")
cat("\tunder the label: MEAN COUNTY-LEVEL SHIFT TOWARDS TRUMP\n")
cat(" --- \n\n")

#Aggregate to candidate-level
candRes <- aggregate(list(cand$original,cand$audited),
  		      by=list(cand$cand_id,cand$office_cat,cand$party),
  		      FUN=sum)
names(candRes) <- c('cand','office_cat','party','original','audited')
candRes$diff <- candRes$audited - candRes$original
candRes$propDiff <- candRes$diff/candRes$audited
candRes <- candRes[candRes$cand != '',]

#The median proportional change in a candidate's vote total as result of an
# audit
cat("MEDIAN PROPORTIONAL CHANGE BY CANDIDATE:\n")
medianCandErrs <- median(candRes$propDiff)
cat(paste0("\t",ClnNum(medianCandErrs),"\n\n"))

#Subsetting to candidates who have at least 1000 votes. Note that this not
# exactly what we refer to in the text when we refer to ``jurisdictions that
# audited at least a thousand votes for a candidate'' -- that is intended as
# a summary of Figure 5 and can be easily verified by inspection. However
# the following sum is of course closely related
cat("MEDIAN PROPORTIONAL CHANGE FOR CANDIDATES WITH AT LEAST 1,000 VOTES:\n")
cat("\t",median(abs(candRes$propDiff[candRes$audited >= 1000])),"\n\n")

#Ballot-level results
cat("SHARE OF BALLOTS IN BALLOT-LEVEL AUDITS WITH ERRORS:\n")
meanBallotErrs <-sum(abs(ball$issues))/sum(ball$ballots)
cat(paste0("\t",ClnNum(meanBallotErrs),"\n\n"))

cat("NUMBER OF PRESIDENTIAL VOTES AUDITED:\n")
cat(paste0("\t",ClnNum(nPresAll),"\n\n"))

cat("NUMBER OF TRUMP VOTES AUDITED:\n")
cat(paste0("\t",ClnNum(nTrump),"\n\n"))

cat("NUMBER OF BIDEN VOTES AUDITED:\n") 
cat(paste0("\t",ClnNum(nBiden),"\n\n")) 

cat(" --- \n")
cat("\tWe calculate the total (not county-level) presidential margin shift \n")
cat("\tbelow under the label: OVERALL PROPORTIONAL SHIFT TOWARDS TRUMP\n")
cat(" --- \n\n")

#Calculate the number of candidates by combining state, office, and name
cat("TOTAL NUMBER OF CANDIDATES:\n")
cat("\t",length(unique(paste0(cand$state,cand$office,cand$candidate))),"\n\n")

#Lower bound on the number of individual voters in our dataset, obtained by
# summing votes for U.S. President and then adding the number of ballots in the
# (mutually exclusive) ballot-level data
cat("LOWER BOUND ON THE NUMBER OF VOTERS:\n")
lowerBound <- sum(cand$audited[cand$office == 'US PRESIDENT']) + nBallots
cat(paste0("\t",ClnNum(lowerBound),"\n\n"))

#Here we retain some figures that were useful for us in forming a broad
# impression of the data, but which are not directly cited in the paper.
# Uncomment the lines which have been commented out to inspect these values.
#cat("Trump audited votes by state:\n")
trumpTotals <- cand[cand$candidate == 'DONALD J TRUMP',]
trumpTotals <- trumpTotals[,c('state','audited')]
trumpTotals <- aggregate(trumpTotals$audited,by=list(trumpTotals$state),FUN=sum)
names(trumpTotals) <- c('state','audited')
trumpTotals <- trumpTotals[order(-trumpTotals$audited),]
trumpTotals$total <- trumpTotals$audited/sum(trumpTotals$audited)
cat("TRUMP VOTES AUDITED BY STATE:\n")
print(trumpTotals)
cat("\n")

#cat("Proportional Trump errors:\n")
totalTrumpErrs <- sum(cand$difference[cand$candidate == 'DONALD J TRUMP'],
		      na.rm=TRUE)
totalTrump <- sum(cand$audited[cand$candidate == 'DONALD J TRUMP'],
		  na.rm=TRUE)
propTrumpErr <- totalTrumpErrs/totalTrump
#cat(paste0("\t",ClnNum(propTrumpErr),"\n\n"))

#cat("Median Trump errors:\n")
medianTrumpErrs <- median(cand$difference[cand$candidate == 'DONALD J TRUMP']/
  		          cand$audited[cand$candidate == 'DONALD J TRUMP'],
	      	          na.rm = TRUE
		         )
#cat(paste0("\t",ClnNum(medianTrumpErrs),"\n\n"))

#cat("Biden audited votes by state:\n")
bidenTotals <- cand[cand$candidate == 'JOSEPH R BIDEN',]
bidenTotals <- bidenTotals[,c('state','audited')]
bidenTotals <- aggregate(bidenTotals$audited,by=list(bidenTotals$state),FUN=sum)
names(bidenTotals) <- c('state','audited')
bidenTotals <- bidenTotals[order(-bidenTotals$audited),]
bidenTotals$total <- bidenTotals$audited/sum(bidenTotals$audited)
cat("BIDEN VOTES AUDITED BY STATE:\n")
print(bidenTotals)
cat("\n")

#cat("Mean Biden errors:\n") 
meanBidenErrs <- mean(cand$difference[cand$candidate == 'JOSEPH R BIDEN']/ 
                      cand$audited[cand$candidate == 'JOSEPH R BIDEN'], 
                      na.rm = TRUE 
                     ) 
#cat(paste0("\t",ClnNum(meanBidenErrs),"\n\n")) 
 
#cat("Median Biden errors:\n") 
medianBidenErrs <- median(cand$difference[cand$candidate == 'JOSEPH R BIDEN']/ 
                          cand$audited[cand$candidate == 'JOSEPH R BIDEN'], 
                          na.rm = TRUE 
                         ) 
#cat(paste0("\t",ClnNum(medianBidenErrs),"\n\n")) 

#cat("Total Presidential votes audited by state:\n") 
presTotals <- cand[cand$office == 'US PRESIDENT',]
presTotals <- presTotals[,c('state','audited')]
presTotals <- aggregate(presTotals$audited,by=list(presTotals$state),FUN=sum)
names(presTotals) <- c('state','total_audited')
#Now merge those three together
names(bidenTotals)[names(bidenTotals) == 'audited'] <- 'biden_audited'
bidenTotals <- bidenTotals[,c('state','biden_audited')]
names(trumpTotals)[names(trumpTotals) == 'audited'] <- 'trump_audited'
trumpTotals <- trumpTotals[,c('state','trump_audited')]
presTotals <- merge(presTotals, bidenTotals, by=c('state'))
presTotals <- merge(presTotals, trumpTotals, by=c('state'))
presTotals <- presTotals[order(-presTotals$total_audited),]
#print(presTotals)

#cat("Total Presidential, Biden, and Trump votes audited:\n") 
#print(c(sum(presTotals$total_audited),
#        sum(presTotals$biden_audited),
#        sum(presTotals$trump_audited)))


################################################################################
# Descriptive stats
################################################################################
png(paste0(FIG_DIR,'FIG2a_votes_by_state.png'), width=8, height=12,
    units='in', res=300)
par(mar=c(5,10,3,3))
countByState <- aggregate(list(cand$audited),
		      	  by=list(cand$state),
		      	  FUN=sum)
names(countByState) <- c('state','votes')
ballotsByState <- aggregate(list(ball$ballots),
		     	    by=list(ball$state),
		     	    FUN=sum)
names(ballotsByState) <- c('state','ballots')
for(state in unique(ballotsByState$state)){
  if (!(state %in% unique(countByState$state))){
    newRow <- data.frame(state,
			 ballotsByState$ballots[ballotsByState$state == state]
    			)
    names(newRow) <- c('state','votes')
    countByState <- rbind(countByState,newRow)
  } else {
  countByState$votes[countByState$state == state] <-
	  countByState$votes[countByState$state == state] +
	  ballotsByState$ballots[ballotsByState$state == state]
  }
}
countByState <- countByState[order(countByState$votes),]
barplot(height = log10(countByState$votes),
        main = 'Votes audited by state',
        cex.main = 2,
	horiz = TRUE,
        col = rgb(1, 105/255, 180/255, 0.75),
	xaxt = 'n',
	yaxt = 'n',
	xlab = expression(Total~votes~audited~(log[10])),
	cex.lab = 1.5,
	xlim = c(0,8)
       )
xLocs <- seq(0,max(log10(countByState$votes))+1)
xLabs <- paste0("10^",xLocs)
axis(side = 1,
     at = xLocs,
     labels = parse(text=xLabs),
     las = 1,
     cex.axis = 1.5
     )
# I believe the bars have width 1, and are 0.2 separated. So we need to start
# at 0.7 = 0.5 + 0.2. Then for every increment we need to increase by 1.2
countByState$state[countByState$state == 'DISTRICT OF COLUMBIA'] <- 
					 'WASHINGTON, D.C.'
axis(side = 2,
     at = seq(0.7,nrow(countByState)+5,by=1.2),
     labels = countByState$state,
     las = 2,
     font = 2
     )
XMAX <- max(log10(countByState$votes),na.rm=TRUE)+1
blockCount <- XMAX
for(sep in seq(0, blockCount)){
  abline(v=sep,lwd=1.25,col=rgb(0.1,0.1,0.1,0.25),lty='solid')
  newXLocs <- log10(seq(1,10) * 10**sep)
  for(xLoc in newXLocs){
    abline(v=xLoc,col=rgb(0.3,0.3,0.3,0.1))
  }
}
quietly <- dev.off()

png(paste0(FIG_DIR,'FIG2b_offices_by_state.png'), width=8, height=12,
    units='in', res=300)
par(mar=c(5,10,3,3))
officesByState <- aggregate(office ~ state,
		      	    cand,
		      	    function(x) length(unique(x)))
names(officesByState) <- c('state','offices')
officesByState <- officesByState[order(officesByState$offices),]
barplot(height = officesByState$offices,
        main = 'Distinct offices audited by state',
        cex.main = 2,
	horiz = TRUE,
        col = rgb(1, 105/255, 180/255, 0.75),
	xlim = c(0,200),
	cex.axis = 1.25,
	yaxt = 'n',
	xlab = "Total offices audited",
	cex.lab = 1.5
       )
officesByState$state[officesByState$state == 'DISTRICT OF COLUMBIA'] <- 
			                     'WASHINGTON, D.C.'
axis(side = 2,
     at = seq(0.7,nrow(officesByState)*1.2,by=1.2),
     labels = officesByState$state,
     las = 2,
     font = 2
     )
for(sep in seq(0, 200, by=50)){
  abline(v=sep,lwd=1.25,col=rgb(0.1,0.1,0.1,0.25),lty='solid')
}
text(officesByState$offices,
     x = officesByState$offices + 10,
     y = seq(0.7,nrow(countByState)*1.2-3,by=1.2),
     font = 2,
     xpd = TRUE,
     cex = 1.15
     ) 
quietly <- dev.off()


################################################################################
# Plot the number of votes per county as a kernel density plot
################################################################################
trumpAu <- au[au$candidate == 'DONALD J TRUMP',]
trumpByCounty <- aggregate(list(trumpAu$audited),
                          by=list(trumpAu$state,trumpAu$county,trumpAu$town),
                          FUN=sum)
names(trumpByCounty) <- c('state','county','town','votes')
trumpByCounty$county[trumpByCounty$county == ''] <-
  trumpByCounty$town[trumpByCounty$county == '']
trumpByCounty <- trumpByCounty[,c('state','county','votes')]


ballotsByCounty <- aggregate(list(ball$ballots),
                            by=list(ball$state,ball$county,ball$town),
                            FUN=sum)
names(ballotsByCounty) <- c('state','county','town','votes')
ballotsByCounty$county[ballotsByCounty$county == ''] <-
  ballotsByCounty$town[ballotsByCounty$county == '']
ballotsByCounty <- ballotsByCounty[,c('state','county','votes')]

auDensity <- density(log10(trumpByCounty$votes), bw='sj')
png(paste0(FIG_DIR,'SUPP1a_votes_density.png'), width=6, height=6,
    units='in', res=300)
plot(auDensity,
     main = "Kernel density of audited Trump votes per county",
     xlab = expression(Number~of~Trump~votes~(log[10])),
     xaxt = 'n'
     )
xLocs <- seq(1, round(log10(max(trumpByCounty$votes))))
xLabs <- paste0("10^",xLocs)
axis(side=1,at=xLocs, labels=parse(text=xLabs))
grid()
quietly <- dev.off()

ballDensity <- density(log10(ballotsByCounty$votes), bw='sj')
png(paste0(FIG_DIR,'SUPP1b_ballots_density.png'), width=6, height=6,
    units='in', res=300)
plot(ballDensity,
     main='Kernel density of audited ballots per county',
     xlab = expression(Number~of~ballots~(log[10])),
     xaxt = 'n'
     )
xLocs <- seq(1, round(log10(max(ballotsByCounty$votes))))
xLabs <- paste0("10^",xLocs)
axis(side=1,at=xLocs, labels=parse(text=xLabs))
grid()
quietly <- dev.off()

#The following county-level features are not directly cited in the paper and
# are commented out of the printout for ease of reading. Uncomment to view
# them in the analysis log file
#cat("Number of counties that reported candidate-level audits with Trump:\n")
numCountiesTrump <- nrow(trumpByCounty)
#cat(paste0("\t",ClnNum(numCountiesTrump),"\n\n"))

#cat("Median audited trump votes by county:\n")
medTrumpCounty <- median(trumpByCounty$votes)
#cat(paste0("\t",ClnNum(medTrumpCounty),"\n\n"))

#cat("Largest audited trump votes by county:\n")
maxTrumpCounty <- max(trumpByCounty$votes)
#cat(paste0("\t",ClnNum(maxTrumpCounty),"\n\n"))

#cat("Number of counties that reported ballot-level audits:\n")
numCountiesBallot <- nrow(ballotsByCounty)
#cat(paste0("\t",ClnNum(numCountiesBallot),"\n\n"))

#cat("Median ballots by county:\n")
medBallotsCounty <- median(ballotsByCounty$votes)
#cat(paste0("\t",ClnNum(medBallotsCounty),"\n\n"))

#cat("Largest audited ballots by county:\n")
maxBallotsCounty <- max(ballotsByCounty$votes)
#cat(paste0("\t",ClnNum(maxBallotsCounty),"\n\n"))


################################################################################
# Plot the number of changes
################################################################################
png(paste0(FIG_DIR,'FIG4_num_changes.png'), width=12, height=8, units='in', res=300)
par(mfrow=c(2,3),mar=c(2.5,7,10,1))
for(office in PARTISAN_OFFICES){
  currDf <- cand[cand$office == office,]
  voteSums <- aggregate(list(currDf$original,currDf$audited),
  		      by=list(currDf$party),
  		      FUN=sum)
  names(voteSums) <- c('party','original','audited')
  voteSums <- as.matrix(voteSums)
  rownames(voteSums) <- voteSums[,'party']
  for(party in PTY_ORDER){
    if(!(party %in% row.names(voteSums))){
      voteSums <- rbind(voteSums,c('',0,0))
      row.names(voteSums)[nrow(voteSums)] <- party
    }
  }
  voteSums <- voteSums[PTY_ORDER,]
  voteSums <- voteSums[,c('original','audited')]
  bothNames <- list(row.names(voteSums),colnames(voteSums))
  voteSums <- matrix(as.numeric(voteSums),
		     ncol = ncol(voteSums),
		     dimnames = bothNames
  		    )
  voteSums <- t(voteSums)
 
  MAX_Y <- max(voteSums)
  increment <- MAX_Y/NUM_PLOT_LOCS
  barplot(height = voteSums,
  	  beside = TRUE,
	  ylim = c(0,MAX_Y),
	  main = office,
	  cex.main = 2,
	  xaxt = 'n',
	  yaxt = 'n',
  	  col = COL_ORDER_GROUP
         )
  locs <- seq(0, MAX_Y, by = increment)
  locs <-round(locs,0)
  labs <- GenLabs(locs)
  labs[locs == 0] <- "0"
  axis(2,
       at = locs,
       labels = labs,
       las = 1,
       cex.axis = 1.5
       )
  colnames(voteSums) <- MakePtysShort(colnames(voteSums))
  xLabLocs <- seq(ncol(voteSums))*3-1
  axis(1,
       at = xLabLocs,
       labels = colnames(voteSums),
       cex.axis = 1.25,
       las = 1,
       font = 2
       )
  diffs <- voteSums[2,] - voteSums[1,]
  propDiffs <- diffs/voteSums[2,]
  pctDiffs <- round(propDiffs*100,1)
  pctDiffs[is.na(pctDiffs)] <- 0
  pctDiffs <- format(pctDiffs,nsmall=1)
  diffLabs <- paste0(pctDiffs,'%')
  diffLabs <- gsub(' ','',diffLabs)
  diffLabs <- sapply(diffLabs, function(x)
		     	       if (substr(x,1,1) == '-') x
			       else paste0('+',x)
  		    )
  diffLabs <- unname(diffLabs)
  text(diffLabs,
       x=xLabLocs,
       y=apply(voteSums,2,max)+increment/2,
       xpd = TRUE,
       cex = 1.5
       )
}
quietly <- dev.off()


################################################################################
# Plot candidate-level changes
################################################################################
cats <- unique(cand$office_cat)
#MULTI is not interesting for this purpose
cats <- cats[cats != 'MULTI']
candRes <- aggregate(list(cand$original,cand$audited),
  		      by=list(cand$cand_id,cand$office_cat,cand$party),
  		      FUN=sum)
names(candRes) <- c('cand','office_cat','party','original','audited')
candRes$diff <- candRes$audited - candRes$original
candRes$propDiff <- candRes$diff/candRes$audited
candRes <- candRes[candRes$cand != '',]
write.csv(candRes,'/home/sbaltz/documents/audits/analysis/candRes.csv',
	  row.names=FALSE)
#cat("MEDIAN CAND-LEVEL PROPORTIONAL DIFFERENCE:\n")
#cat("\t",median(abs(candRes$propDiff[candRes$audited >= 1000])),"\n")
#cat("NUMBER AND SHARE OF CANDS WITH NO DIFFERENCE:\n")
#cat("\tCount: ",sum(candRes$diff == 0),"\n",
#    "\tTotal:",nrow(candRes),"\n",
#    "\tShare: ",sum(candRes$diff == 0)/nrow(candRes),
#    "\n")
png(paste0(FIG_DIR,'FIG5_cand_diffs_loglog.png'), width=16, height=10, units='in', res=300)
par(mar=c(7,7,2,5),
    mgp=c(3,1.33,0)
   )
#for(cat in cats){
currDf <- candRes#[candRes$office_cat == cat,]
currDf$logAudited <- log10(currDf$audited)
currDf$logAudited[currDf$audited == 0] <- 0
currDf$logOriginal <- log10(currDf$original)
currDf$logOriginal[currDf$original == 0] <- 0
currDf$logPropDiff <- log10(currDf$propDiff)
#currDf$logPropDiff[currDf$propDiff == 0] <- 0
currDf$logDiff <- log10(currDf$diff)
currDf$logDiff[currDf$diff == 0] <- 0
#Assign zeroes to what will be the minimum y-axis value on the log-log plot
#currDf$logPropDiff[currDf$propDiff == 0] <- -8
XMAX <- 7
YMIN <- -8
plot(x = currDf$logAudited,
     y = currDf$logPropDiff,
     cex = 1.5,
     pch = 21,
     col = 'white',
     ylim = c(YMIN+1.3,-0.2),
     xlim = c(0.3,XMAX-0.3),
     yaxt = 'n',
     xaxt = 'n',
     ylab = "",
     xlab = "",
     cex.lab = 2
)
text(x = -0.55,
     y = -4.75,
     labels = "Number of unchanged totals (bars)",
     xpd = TRUE,
     cex = 2,
     srt = 90
    )
text(x = 7.25,
     y = -2.25,
     labels = "Error rate for changed totals (circles)",
     xpd = TRUE,
     cex = 2,
     srt = 270
    )
mtext(side=1, text="Number of votes audited", line=5, cex=2)
blockCount <- 8
for(sep in seq(0, blockCount)){
  abline(h=-sep,lwd=1.25,col=rgb(0.2,0.2,0.2,0.5),lty='solid')
  abline(v=sep,lwd=1.25,col=rgb(0.2,0.2,0.2,0.5),lty='solid')
  newYLocs <- log10(seq(1,10) * 10**-sep)
  newXLocs <- log10(seq(1,10) * 10**sep)
  for(yLoc in newYLocs){
    abline(h=yLoc,col=rgb(0.3,0.3,0.3,0.25))
  }
  for(xLoc in newXLocs){
    abline(v=xLoc,col=rgb(0.3,0.3,0.3,0.25))
  }
}
yLocs <- seq(YMIN,0,by=1)
for(y in yLocs){
  abline(h=y,col='black',lwd=4)
}
#Overwrite the log lines
polygon(x = c(0,-10,XMAX+1,XMAX),
        y = c(0,-10,-10,-7),
        col = 'white',
        border = 'transparent'
	)

#Add bars for zero difference frequencies
noChangeDf <- currDf[currDf$propDiff == 0,]
ncCount <- data.frame(table(noChangeDf$audited))
names(ncCount) <- c('audited','freq')
#ncCount$freq <- YMIN+1+ncCount$freq/10 #Transform onto the other axis
ncCount$audited <- as.numeric(levels(ncCount$audited))[ncCount$audited]
#Transform the count attached to exact log audited totals to visually nice bins

binFreqs <- data.frame()
linSteps <- seq(0,6)
maxCount <- 0
COUNT_DIV_FACT <- 50 #Automatically squishes the bars down to the plot scale
for(step in linSteps){
  for(coef in seq(9)){
    binLow <- coef*10**step
    binHigh <- (coef+1)*10**step
    inBin <- ncCount[(ncCount$audited >= binLow) &
		     (ncCount$audited < binHigh),]
    if(nrow(inBin) == 0){
      count <- 0
    } else {
      count <- sum(inBin$freq) 
    }
    if (count > maxCount){
      maxCount <- count
    }
    #Transform the count
    count <- YMIN + 1 + count/COUNT_DIV_FACT
    rect(xleft = log10(binLow),
         xright = log10(binHigh),
         ytop = count,
         ybottom = YMIN,
         col = rgb(1,0.25,0.55,0.75),
         border = 'black'
        )
  }
}
yLocs <- YMIN + 1 + seq(0,maxCount+25,by=25)/COUNT_DIV_FACT
yLabs <- seq(0,maxCount+25,by=25)
axis(2,
     at = yLocs,
     labels = yLabs,
     las = 1,
     cex.axis = 2
    )

for(sep in seq(0, blockCount)){
  abline(v=sep,lwd=1.25,col=rgb(0.2,0.2,0.2,0.5),lty='solid')
  newXLocs <- log10(seq(1,10) * 10**sep)
  for(xLoc in newXLocs){
    abline(v=xLoc,col=rgb(0.3,0.3,0.3,0.25))
  }
}
#Replace the vertical lines for number of votes audited
yLocs <- seq(YMIN,0,by=1)
yLabs <- paste0("10^",yLocs)
yLabs[length(yLabs)] <- 1
yLabs <- parse(text=yLabs)
text(y=-0.3,x=0.1,labels="1",cex=2)
yLocs <- yLocs[1:length(yLocs)-1]
yLabs <- yLabs[1:length(yLabs)-1]
text(y = yLocs-0.25,
     x = -yLocs-0.25,
     labels = yLabs,
     cex = 2
)
xLocs <- seq(0,XMAX,by=1)
xLocs[1] <- xLocs[1] + 0.05
xLabs <- paste0("10^",xLocs)
xLabs <- parse(text=xLabs)
xLabs[1] <- 1
axis(1,
     at = xLocs,
     labels = xLabs,
     las = 1,
     cex.axis = 2
     )
segments(x0=0, x1=XMAX, y0=0, y1=YMIN+1, col = 'black', lwd=7)
points(x = currDf$logAudited[!(currDf$party %in% c('DEMOCRAT','REPUBLICAN'))],
       y = currDf$logPropDiff[!(currDf$party %in% c('DEMOCRAT','REPUBLICAN'))],
       bg = rgb(1,0,1,0.4),
       col = 'black',
       cex = 2.5,
       pch = 21
	)
points(x = currDf$logAudited[currDf$party == 'DEMOCRAT'],
       y = currDf$logPropDiff[currDf$party == 'DEMOCRAT'],
       bg = rgb(57/255,127/255,255/255,0.4),
       col = 'black',
       cex = 2.5,
       pch = 21
	)
points(x = currDf$logAudited[currDf$party == 'REPUBLICAN'],
       y = currDf$logPropDiff[currDf$party == 'REPUBLICAN'],
       bg = rgb(255/255,57/255,57/255,0.4),
       col = 'black',
       cex = 2.5,
       pch = 21
	)
quietly <- dev.off()


################################################################################
# Plot the change in Trump-Biden margin between original and audited counts
################################################################################
pres <- cand[cand$candidate == 'JOSEPH R BIDEN' |
	     cand$candidate == 'DONALD J TRUMP',]
pres$geoid <- paste0(pres$state,'_',pres$county,'_',pres$township)
pres <- pres[c('geoid','candidate','original','audited')]
presVotes <- aggregate(list(pres$original,pres$audited),
  		       by=list(pres$geoid,pres$candidate),
  		       FUN=sum)
names(presVotes) <- c('geoid', 'candidate', 'original', 'audited')
biden <- presVotes[presVotes$candidate == 'JOSEPH R BIDEN',]
names(biden)[names(biden) == 'original'] <- 'biden_original'
names(biden)[names(biden) == 'audited'] <- 'biden_audited'
trump <- presVotes[presVotes$candidate == 'DONALD J TRUMP',]
names(trump)[names(trump) == 'original'] <- 'trump_original'
names(trump)[names(trump) == 'audited'] <- 'trump_audited'
bothPres <- merge(biden, trump, by='geoid')

# We will compute Delta in the text, and we will call that variable mDiffProp:
#
#             (trump_audited - biden_audited)-(trump_original - biden_original)
# mDiffProp = ------------------------------------------------------------------
#                              biden_audited + trump_audited
#
bothPres$origMargin <- bothPres$trump_original - bothPres$biden_original
bothPres$auditMargin <- bothPres$trump_audited - bothPres$biden_audited
bothPres$marginDiff <- bothPres$auditMargin - bothPres$origMargin
bothPres$mDiffProp <- bothPres$marginDiff/(bothPres$biden_audited+
                                           bothPres$trump_audited)

cat("LARGEST SHIFT TOWARDS TRUMP IN ANY COUNTY:\n")
maxTrumpShift <- max(bothPres$mDiffProp)
cat(paste0("\t",ClnNum(maxTrumpShift),"\n\n")) 

cat("LARGEST SHIFT TOWARDS BIDEN IN ANY COUNTY:\n")
maxBidenShift <- min(bothPres$mDiffProp)
cat(paste0("\t",ClnNum(maxBidenShift),"\n\n")) 

colours <- rep(NA,length(bothPres$marginDiff))
trumpCol <- rgb(1,0,0,0.1)
bidenCol <- rgb(0,0,1,0.1)
neitherCol <- rgb(0.3,0.3,0.3,0.75)
colours[bothPres$marginDiff < 0] <- bidenCol
colours[bothPres$marginDiff > 0] <- trumpCol
colours[bothPres$marginDiff == 0] <- neitherCol

yExtreme <- max(abs(min(bothPres$mDiffProp)),
		abs(max(bothPres$mDiffProp)))

bothPres$total_audited <- bothPres$biden_audited + bothPres$trump_audited

#The mean county-level shift, Delta in the text (divided by 100), is the mean of
# the mDiffProp column
cat("MEAN COUNTY-LEVEL SHIFT TOWARDS TRUMP: \n\t",
    mean(bothPres$mDiffProp),"\n\n")

#cat("Median county-level shift towards Trump: \n\t",
#    median(bothPres$mDiffProp),"\n")

#The total national shift, Delta in the text (divided by 100), is the mean of
# the mDiffProp column
totalAuditMargin <- sum(bothPres$trump_audited) - sum(bothPres$biden_audited)
totalOrigMargin <- sum(bothPres$trump_original) - sum(bothPres$biden_original)
totalAudited <- sum(bothPres$trump_audited + bothPres$biden_audited)
totalPropShift <- (totalAuditMargin - totalOrigMargin) / totalAudited
cat("OVERALL PROPORTIONAL SHIFT TOWARDS TRUMP:\n\t",totalPropShift,"\n")

#Let's try some more ways of visualizing this quantity. First,
# build a matrix where each column is an observed proportional difference in
# margin, and the rows are the number of votes in counties with that difference
diffByCty <-bothPres[,c('total_audited','mDiffProp')]
diffMat <- as.matrix(table(diffByCty))
for(i in seq(nrow(diffMat))){
  diffMat[i,] <- diffMat[i,] * as.numeric(row.names(diffMat)[i])
}
#Now each column name of the matrix is an x-axis position. Each value is the
# amount to add to the previous total, so we need a cumulative total up to each
# row
sumDiffs <- diffMat
for(i in seq(2,nrow(sumDiffs))){
  sumDiffs[i,] <- sumDiffs[i,] + sumDiffs[i-1,]
}
png(paste0(FIG_DIR,'FIG3_margins_by_county.png'), width=16, height=10,
    units='in', res=300)
par(mar=c(7,11,5,2))
YMIN <- -0.0075
YMAX <- 0.0075
YRANGE <- YMAX - YMIN
XMIN <- log10(min(sumDiffs[sumDiffs!=0]))
XMAX <- log10(max(sumDiffs))
plot(x=seq(XMIN, XMAX),
     xlim = c(XMIN, XMAX),
     ylim = c(YMIN, YMAX),
     col = 'white',
     yaxt = 'n',
     xaxt = 'n',
     xlab = '',
     ylab = '',
     main = 'Change by County in Trump-Biden Margin Due to Audit',
     cex.main = 2,
     bty = "n"
     )
text("County-level shift in margin",
     x = XMIN-0.8,
     y = 0,
     xpd = TRUE,
     cex = 2,
     srt = 90,
)
text(parse(text="Towards~Trump %->% phantom(.)"),
     x = XMIN-0.6,
     y = 0.006,
     xpd = TRUE,
     cex = 1.75,
     srt = 90,
     )
text(parse(text="phantom(.) %<-% Towards~Biden"),
     x = XMIN-0.6,
     y = -0.006,
     xpd = TRUE,
     cex = 1.75,
     srt = 90,
)
text("Number of votes, stacked wherever counties had identical shifts",
     x = (XMIN+XMAX)/2,
     y = YMIN-0.003,
     xpd = TRUE,
     cex = 2,
     srt = 0,
)
blockCount <- XMAX*1.1
for(sep in seq(XMIN, blockCount)){
  abline(v=sep,lwd=1.25,col=rgb(0.1,0.1,0.1,0.5),lty='solid')
  newXLocs <- log10(seq(1,10) * 10**sep)
  for(xLoc in newXLocs){
    abline(v=xLoc,col=rgb(0.3,0.3,0.3,0.15))
  }
}
for(yVal in seq(-0.008,0.008,by=0.001)){
  abline(h=yVal,lwd=2,lty='solid',col=rgb(0.1,0.1,0.1,0.15))
}
yLocs <- seq(-0.008, 0.008, by=0.001)
yLabs <- yLocs * 100
yLabs <- ClnNum(yLabs)
yLabs <- paste0(yLabs, "%")
axis(2,
     at = yLocs,
     labels = yLabs,
     las = 1,
     cex.axis = 1.75
)
xLocs <- seq(0, XMAX*1.1, by = 1)
xLabs <- parse(text=paste0("10^",xLocs))
axis(1,
     at = xLocs,
     labels = xLabs,
     las = 1,
     cex.axis = 1.75
)
#Plot everything except zero, so that zero shows up on top, since it has the
# most structure that readers may want to perceive
for(l in labels(sumDiffs)$mDiffProp[as.numeric(labels(sumDiffs)$mDiffProp)!=0]){
  minx <- 0
  for (r in seq(nrow(sumDiffs))){
    if (sumDiffs[r,l] != 0) {
      currSum <- log10(sumDiffs[r,l])
      currDiff <- as.numeric(l)
      if(as.numeric(l) < 0){
        currCol <- bidenCol
      } else if (as.numeric(l) > 0) {
        currCol <- trumpCol
      }
      rect(xleft = minx,
           xright = currSum,
           ytop = currDiff + YRANGE/150,
           ybottom = currDiff - YRANGE/150,
           col = currCol,
           border = currCol
      )
      minx <- currSum
    }
  }
}
#Now add the zero column, visually on top of the others
l <- "0"
minx <- 0
for (r in seq(nrow(sumDiffs))){
  currSum <- log10(sumDiffs[r,l])
  currDiff <- as.numeric(l)
  currCol <- rgb(0,0,0,0.25)
  rect(xleft = minx,
       xright = currSum,
       ytop = currDiff + YRANGE/150,
       ybottom = currDiff - YRANGE/150,
       col = rgb(1,0,1,0.75), #Could also use neitherCol here
       border = 'black'
  )
  minx <- currSum
}
quietly <- dev.off()


################################################################################
# Plot the distribution of changes in the pres margin as a kernel density plot
################################################################################
diffDensity <- density(bothPres$mDiffProp)
png(paste0(FIG_DIR,'SUPP2_pres_shift_density.png'), width=6, height=6,
    units='in', res=300)
plot(diffDensity,
     main = "Kernel density of County-Level Shift in Presidential Margin",
     xlab = "Net Shift Towards Trump at County Level",
)
grid()
quietly <- dev.off()


################################################################################
# Plot error rate for total number of ballots counted by state
################################################################################
xVals <- log10(totals$audited)
yVals <- log10(abs(totals$difference)) * sign(totals$difference)
yVals[totals$difference == 0] <- 0
cols <- c('#174EA6','#A50E0E','#E37400','#0D652D','#4285F4',
          '#EA4335','#FBBC04','#34A853','#D2E3FC','#FAD2CF',
          '#FEEFC3','#CEEAD6','#202124','#9AA0A6')
png(paste0(FIG_DIR,'SUPP3_total_ballot_changes.png'), width=13, height=8,
    units='in', res=300)
par(mar=c(5,5,3,3))
plot(xVals,
     yVals,
     pch = 21,
     bg = 'white',
     col = 'white',
     ylim = c(-4,4),
     xaxt = 'n',
     yaxt = 'n',
     ylab = '',
     xlab = 'Total Number of Ballots Audited',
     main = 'Change in Total Number of Ballots Due to Audit',
     cex.lab = 1.25,
     cex.main = 1.5
    )
mtext(side=2, text="Change in Total Number of Ballots", line=3.75, cex=1.25)
abline(h=0, lwd=2)
XMAX <- max(xVals)+1
blockCount <- XMAX
for(sep in seq(0, blockCount)){
  abline(v=sep,lwd=1.25,col=rgb(0.1,0.1,0.1,0.5),lty='solid')
  newXLocs <- log10(seq(1,10) * 10**sep)
  for(xLoc in newXLocs){
    abline(v=xLoc,col=rgb(0.3,0.3,0.3,0.15))
  }
}

for(sep in seq(-4,4)){
  abline(h=sep,lwd=1.25,col=rgb(0.1,0.1,0.1,0.5),lty='solid')
  newXLocs <- log10(seq(1,10) * 10**sep)
  for(xLoc in newXLocs){
    abline(h=xLoc,col=rgb(0.3,0.3,0.3,0.15))
  }
}
states <- sort(unique(totals$state))
for(i in seq(length(states))){
  state <- states[i]
  points(xVals[totals$state == state],
         yVals[totals$state == state],
         bg = adjust_transparency(cols[i],0.65),
         col = rgb(0,0,0,0.25),
         pch = 21,
         cex = 1.5
  )
}
xLocs <- seq(0, XMAX, by = 1)
xLabs <- parse(text=paste0("10^",xLocs))
axis(1,
     at = xLocs,
     labels = xLabs,
     las = 1,
     cex.axis = 1.1
)
yLocs <- seq(-4,4,by=1)
yLabs <- parse(text=c("-10^4","-10^3","-10^2","-10^1","0",
                      "10^1","10^2","10^3","10^4"))
axis(2,
     at = yLocs,
     labels = yLabs,
     las = 1,
     cex.axis = 1.1
)
quietly <- dev.off()


################################################################################
# Plot discrepancy rate in ballot data
################################################################################
ballIss <-aggregate(list(ball$ballots,ball$issues),
          by=list(ball$state,ball$county),
          FUN=sum)
names(ballIss) <- c('state','county','ballots','issues')
png(paste0(FIG_DIR,'SUPP4a_ballot_discrepancies.png'), width=13, height=8,
    units='in', res=300)
plot(log10(ballIss$ballots),
     ballIss$issues,
     pch = 19,
     bg = 'white',
     col = 'white',
     xaxt = 'n',
     yaxt = 'n',
     ylab = 'Number of Discrepancies Found',
     xlab = 'Total Number of Ballots Audited',
     main = 'Number of Discrepancies Found in Ballot-Level Audits',
     cex.lab = 1.25,
     cex.main = 1.5
)
XMAX <- max(log10(ballIss$ballots))+1
blockCount <- XMAX
for(sep in seq(0, blockCount)){
  abline(v=sep,lwd=1.25,col=rgb(0.1,0.1,0.1,0.5),lty='solid')
  newXLocs <- log10(seq(1,10) * 10**sep)
  for(xLoc in newXLocs){
    abline(v=xLoc,col=rgb(0.3,0.3,0.3,0.15))
  }
}
xLocs <- seq(0, XMAX, by = 1)
xLabs <- parse(text=paste0("10^",xLocs))
axis(1,
     at = xLocs,
     labels = xLabs,
     las = 1,
     cex.axis = 1.1
)
yLocs <- seq(0,max(ballIss$issues)+5,by=10)
yLabs <- yLocs
axis(2,
     at = yLocs,
     labels = yLabs,
     las = 1,
     cex.axis = 1.1
)
for(l in yLocs){
  abline(h=l,lwd=1.25,col=rgb(0.1,0.1,0.1,0.25),lty='solid')
}
points(log10(ballIss$ballots),
     ballIss$issues,
     pch = 19,
     bg = rgb(0,0,0,0.5),
     col = adjust_transparency('#E37400', 0.2)
)
quietly <- dev.off()

ballDensity <- density(ballIss$issues)
png(paste0(FIG_DIR,'SUPP4b_ballot_density.png'), width=6, height=6,
    units='in', res=300)
plot(ballDensity,
     main = "Kernel density of Number of Ballot-Level Discrepancies",
     xlab = "Number of Discrepancies",
)
grid()
quietly <- dev.off()


#cat("Proportion of ballot-level issues that are exactly 0:\t")
#cat(sum(ballIss$issues == 0) / nrow(ballIss))

sink()
